Remember to change "root" to a directory under your home directory!
# Load required packages, installing any that are missing first.
pkgs <- c("ape", "phangorn", "seqinr", "phytools",
          "plotly", "dplyr", "tidyr", "Matrix",
          "geosphere", "leaflet")
pkgs_missing <- setdiff(pkgs, rownames(installed.packages()))
if (length(pkgs_missing) > 0) install.packages(pkgs_missing, verbose = FALSE)
# library() (unlike require()) errors immediately if a package fails to load.
invisible(lapply(pkgs, library, character.only = TRUE))
# Set paths; change `root` to a directory under your home directory.
root <- "~/projects/ai4all-sfu_bio"
seq_dir <- file.path(root, "data", "FASTA.fa")       # original genetic sequences
meta_dir <- file.path(root, "data", "meta.csv")      # strain metadata
align_dir <- file.path(root, "data", "alignment.fa") # MAFFT alignment output
result_dir_ <- file.path(root, "result")
dir.create(result_dir_, showWarnings = FALSE)
# Reuse a previous result directory; set to "" to start anew.
result_time_ <- "2018-07-09_20-09-04"
# Options
# nseq: number of sequences to sample; require 2 * nseq < total number of sequences.
nseq <- 200
# (console output from package loading:)
## ape phangorn seqinr phytools plotly dplyr tidyr
## TRUE TRUE TRUE TRUE TRUE TRUE TRUE
## Matrix geosphere leaflet
## TRUE TRUE TRUE
# Load sequence alignment; to align sequences, install MAFFT
# (https://mafft.cbrc.jp/alignment/software/).
if (!file.exists(align_dir)) system(paste0("mafft ", seq_dir, " > ", align_dir))
align <- read.dna(align_dir, format = "fasta")
strains_id <- rownames(align)
# Sequence names look like "CY234578-2017/04/17": accession, then date.
strains <- sapply(strsplit(strains_id, "[-]"), function(x) x[1])
dates <- as.Date(sapply(strsplit(strains_id, "[-]"), function(x) x[2]))
# sample for more recent strains (see next section)
set.seed(17)
# Fall back to a fresh run if the requested previous result directory is absent.
if (result_time_ != "" && !dir.exists(paste0(result_dir_, "/", result_time_))) {
  result_time_ <- ""
}
# Timestamped result directory name, e.g. "2018-07-09_20-09-04".
result_time <- if (result_time_ == "") {
  gsub(" ", "_", gsub("[:]", "-", Sys.time()))
} else {
  result_time_
}
result_dir <- paste0(result_dir_, "/", result_time)
dir.create(result_dir, showWarnings = FALSE)
ind_dir <- paste0(result_dir, "/ind.csv")
if (result_time_ == "") {
  # Sample nseq strain IDs from the first min(2*nseq, n) rows of the alignment.
  sam_ind <- rownames(align)[sample(seq_len(min(2 * nseq, nrow(align))), nseq,
                                    replace = FALSE)]
  write.csv(sam_ind, file = ind_dir, row.names = FALSE)
}
sam_ind <- read.csv(ind_dir, stringsAsFactors = FALSE)[, 1]
sam_ind
## [1] "CY234578-2017/04/17" "CY222989-2017/02/14" "CY228725-2017/03/19"
## [4] "CY224126-2017/02/26" "CY228541-2017/03/23" "CY227534-2017/03/13"
## [7] "CY232562-2017/04/11" "CY232547-2017/04/13" "CY228301-2017/02/26"
## [10] "CY230826-2017/04/13" "CY228749-2017/03/21" "CY238923-2017/06/16"
## [13] "CY227254-2017/02/23" "CY227318-2017/02/23" "CY221910-2017/02/16"
## [16] "CY223149-2017/02/17" "CY228893-2017/03/09" "CY230730-2017/04/04"
## [19] "CY227670-2017/03/08" "CY227814-2017/03/15" "CY223277-2017/02/22"
## [22] "CY224062-2017/02/14" "CY228461-2017/03/02" "CY223085-2017/02/27"
## [25] "CY223261-2017/02/22" "CY234810-2017/05/01" "CY231099-2017/03/13"
## [28] "CY223926-2017/02/15" "CY223157-2017/02/20" "CY223021-2017/02/13"
## [31] "CY230690-2017/03/20" "CY235920-2017/05/14" "CY228373-2017/02/27"
## [34] "CY235864-2017/03/29" "CY227630-2017/03/14" "CY236035-2017/04/08"
## [37] "CY225358-2017/03/06" "CY230229-2017/03/07" "CY231125-2017/03/06"
## [40] "CY223069-2017/02/28" "CY225246-2017/03/05" "CY230594-2017/03/10"
## [43] "CY234722-2017/04/09" "CY227798-2017/03/16" "CY230890-2017/04/10"
## [46] "CY235744-2017/04/07" "CY227278-2017/02/22" "MF182513-2017/02/15"
## [49] "CY230882-2017/04/05" "CY227878-2017/03/16" "CY223133-2017/02/26"
## [52] "CY234842-2017/04/27" "CY227806-2017/03/17" "CY230770-2017/04/11"
## [55] "CY238354-2017/03/14" "KY949821-2017/02/14" "CY225406-2017/03/12"
## [58] "CY234698-2017/04/17" "CY222885-2017/02/16" "CY235824-2017/05/08"
## [61] "CY235752-2017/04/28" "CY234192-2017/03/13" "CY232618-2017/04/21"
## [64] "CY235680-2017/04/25" "CY234498-2017/04/17" "CY232372-2017/03/16"
## [67] "KY949985-2017/02/13" "CY232594-2017/04/02" "CY238851-2017/05/08"
## [70] "CY228685-2017/03/15" "CY232578-2017/04/03" "CY225238-2017/03/06"
## [73] "CY234474-2017/04/15" "CY238298-2017/03/06" "CY235624-2017/04/17"
## [76] "MF182482-2017/03/22" "CY227902-2017/03/21" "CY235792-2017/05/07"
## [79] "CY232492-2017/04/10" "CY227590-2017/03/06" "KY950048-2017/02/14"
## [82] "CY238907-2017/06/07" "CY238569-2017/03/31" "CY230277-2017/03/10"
## [85] "CY232310-2017/03/29" "CY223878-2017/02/13" "CY227662-2017/03/02"
## [88] "CY230834-2017/03/20" "CY231012-2017/02/20" "CY227870-2017/02/22"
## [91] "CY227414-2017/02/23" "MF182524-2017/03/10" "CY234272-2017/02/27"
## [94] "CY232364-2017/03/30" "CY228933-2017/03/30" "CY228589-2017/03/19"
## [97] "CY221942-2017/02/15" "CY227638-2017/03/14" "CY235712-2017/04/25"
## [100] "CY230021-2017/02/18" "CY224086-2017/02/26" "CY227174-2017/02/20"
## [103] "CY230101-2017/02/26" "CY230069-2017/03/07" "CY223213-2017/02/22"
## [106] "CY227454-2017/03/08" "CY228549-2017/03/28" "CY232356-2017/04/04"
## [109] "CY234562-2017/04/19" "CY235848-2017/04/18" "CY225230-2017/03/06"
## [112] "CY230842-2017/03/31" "CY226998-2017/02/22" "CY235400-2017/04/27"
## [115] "CY230786-2017/04/03" "CY227262-2017/02/21" "CY230117-2017/03/09"
## [118] "CY227302-2017/02/27" "CY225334-2017/03/03" "CY238931-2017/05/22"
## [121] "CY224110-2017/02/27" "MF182461-2017/03/23" "CY234858-2017/04/19"
## [124] "CY232380-2017/03/20" "CY238362-2017/03/21" "CY230285-2017/02/20"
## [127] "CY234690-2017/04/24" "CY223189-2017/02/28" "CY228581-2017/03/23"
## [130] "CY228797-2017/03/20" "CY228845-2017/03/26" "CY233370-2017/03/13"
## [133] "CY227038-2017/02/16" "CY235912-2017/05/22" "CY228757-2017/03/13"
## [136] "CY232468-2017/03/31" "CY231077-2017/03/13" "CY223061-2017/02/27"
## [139] "CY223237-2017/02/25" "CY224094-2017/02/23" "CY234408-2017/03/20"
## [142] "CY230666-2017/03/20" "CY227846-2017/03/02" "CY227742-2017/02/25"
## [145] "CY235976-2017/05/15" "CY235968-2017/05/12" "MF182504-2017/02/22"
## [148] "CY228533-2017/03/20" "CY234522-2017/03/28" "CY233432-2017/03/22"
## [151] "CY232524-2017/04/08" "CY227294-2017/02/26" "CY238915-2017/06/13"
## [154] "CY227326-2017/03/01" "CY227838-2017/03/25" "CY230301-2017/02/27"
## [157] "CY224070-2017/02/22" "CY227830-2017/03/08" "CY231091-2017/03/09"
## [160] "CY225270-2017/02/27" "CY238875-2017/05/30" "CY230530-2017/02/22"
## [163] "CY230357-2017/03/21" "CY227718-2017/03/13" "MF182532-2017/02/21"
## [166] "CY223317-2017/02/27" "CY227206-2017/02/14" "CY227622-2017/03/07"
## [169] "CY234746-2017/05/09" "CY234514-2017/04/05" "CY234482-2017/03/24"
## [172] "CY234874-2017/04/29" "CY233259-2017/03/20" "CY235472-2017/03/04"
## [175] "CY234530-2017/03/30" "CY233448-2017/03/27" "CY234602-2017/04/11"
## [178] "CY238561-2017/03/29" "CY232540-2017/03/31" "CY234834-2017/04/30"
## [181] "CY232340-2017/03/23" "CY231118-2017/03/09" "CY224118-2017/03/06"
## [184] "CY230602-2017/03/14" "CY223125-2017/02/22" "CY225422-2017/03/13"
## [187] "CY227006-2017/02/22" "CY235568-2017/03/27" "CY232610-2017/04/16"
## [190] "CY230714-2017/03/06" "CY227686-2017/03/05" "CY222557-2017/02/23"
## [193] "CY234818-2017/05/08" "CY227446-2017/03/02" "CY234506-2017/04/16"
## [196] "CY232332-2017/03/30" "CY232324-2017/03/30" "KY950005-2017/02/14"
## [199] "CY238662-2017/04/17" "CY223053-2017/02/24"
Calculate the distance between sequences.
# Compute pairwise maximum-likelihood distances (Jukes-Cantor model).
align_n <- align[sam_ind, ]
align_n <- as.phyDat(align_n)
dm <- dist.ml(align_n, model = "JC69")
# Infer the tree using UPGMA.
tree_UPGMA <- upgma(dm)
plot(tree_UPGMA, show.tip.label = FALSE, main = "UPGMA")
# Infer the tree using neighbour joining (next section).
# mt = modelTest(align_n)
# print(mt)
tree_NJ <- NJ(dm)
plot(tree_NJ, main = "Phylogeny: Neighbour Joining", show.tip.label = FALSE)
tree <- ladderize(tree_NJ)
plot(tree, main = "Phylogeny: Neighbour Joining", show.tip.label = FALSE)
is.rooted(tree_NJ) # is the tree rooted?
is.binary(tree_NJ) # are the tree branches binary?
## [1] FALSE
## [1] TRUE
But which of these trees is a better fit for your data? Using the `parsimony()` function, you can compare their respective parsimony scores.
# Lower parsimony score = better fit.
parsimony(tree_UPGMA, align_n)
parsimony(tree_NJ, align_n)
## [1] 598
## [1] 561
`optim.parsimony()` tries to find the maximum-parsimony tree using either nearest-neighbor interchange (NNI) rearrangements or subtree pruning and regrafting (SPR).
# Search for maximum-parsimony trees (NNI via optim.parsimony, SPR via pratchet).
tree_optim <- optim.parsimony(tree_NJ, align_n)
tree_pratchet <- pratchet(align_n)
plot(tree_optim, main = "Phylogeny: Maximum Parsimony NNI", show.tip.label = FALSE)
plot(tree_pratchet, main = "Phylogeny: Maximum Parsimony SPR", show.tip.label = FALSE)
## Final p-score 559 after 2 nni operations
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
## [1] "Best pscore so far: 559"
Maximum-likelihood methods allow you to include the full data from your sequence alignment in a statistical framework that estimates model parameters.
The first thing we need to do is create a special type of object of class "pml". This object includes mostly our tree and data, but also the parameters of an evolutionary model, their values (usually set to some default), and the likelihood of the model.
# Build the pml object (tree + data + model parameters + likelihood).
fit <- pml(tree_NJ, align_n)
## negative edges length changed to 0!
print(fit)
plot(fit, main = "Phylogeny: Neighbour Joining (PML)", show.tip.label = FALSE)
##
## loglikelihood: -7200.111
##
## unconstrained loglikelihood: -5500.084
##
## Rate matrix:
## a c g t
## a 0 1 1 1
## c 1 0 1 1
## g 1 1 0 1
## t 1 1 1 0
##
## Base frequencies:
## 0.25 0.25 0.25 0.25
The function `optim.pml()` will optimize tree topology and branch lengths for your selected model of nucleotide evolution.
# Optimize topology and branch lengths under the JC model.
fitJC <- optim.pml(fit, model = "JC", rearrangement = "stochastic")
logLik(fitJC)
plot(fitJC, main = "Phylogeny: Neighbour Joining Optimized (PML)", show.tip.label = FALSE)
## optimize edge weights: -7200.111 --> -7175.895
## optimize edge weights: -7175.895 --> -7175.895
## optimize topology: -7175.895 --> -7165.78
## optimize topology: -7165.78 --> -7165.78
## optimize topology: -7165.78 --> -7164.921
## 4
## optimize edge weights: -7164.921 --> -7164.921
## optimize topology: -7164.921 --> -7164.921
## 0
## [1] "Ratchet iteration 1 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 2 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 3 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 4 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 5 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 6 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 7 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 8 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 9 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 10 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 11 , best pscore so far: -7164.92050417452"
## [1] "Ratchet iteration 12 , best pscore so far: -7164.92050394874"
## [1] "Ratchet iteration 13 , best pscore so far: -7164.92050394874"
## [1] "Ratchet iteration 14 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration 15 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration 16 , best pscore so far: -7164.92050383717"
## [1] "Ratchet iteration 17 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 18 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 19 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 20 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 21 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 22 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 23 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 24 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 25 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 26 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 27 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 28 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 29 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 30 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 31 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 32 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 33 , best pscore so far: -7164.92050374767"
## [1] "Ratchet iteration 34 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration 35 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration 36 , best pscore so far: -7164.920503434"
## [1] "Ratchet iteration 37 , best pscore so far: -7164.92049494019"
## [1] "Ratchet iteration 38 , best pscore so far: -7164.92049494019"
## [1] "Ratchet iteration 39 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 40 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 41 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 42 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 43 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 44 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 45 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 46 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 47 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 48 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 49 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 50 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 51 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 52 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 53 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 54 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 55 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 56 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 57 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 58 , best pscore so far: -7164.92049435348"
## [1] "Ratchet iteration 59 , best pscore so far: -7164.92049435348"
## optimize edge weights: -7164.92 --> -7164.92
## optimize topology: -7164.92 --> -7164.92
## 0
## optimize edge weights: -7164.92 --> -7164.92
## 'log Lik.' -7164.92 (df=397)
Note that the object returned by `optim.pml()` is not a phylogeny, but an optimized object of class "pml" (which contains an optimized phylogeny with edge lengths). Let's plot this phylogeny:
tree_JC <- fitJC$tree
plot(tree_JC, type = "phylogram", lab4ut = "horizontal", edge.width = 2,
     show.tip.label = FALSE)
# Which tree do we want to do ancestral sequence reconstruction for?
tree_type <- "NJ"
tree <- tree_NJ
# Convert the alignment format.
align_npd <- phyDat(align_n)
# Parsimony reconstruction: based on the Fitch algorithm for bifurcating trees
# (note: there will often be no unique solution).
anc <- ancestral.pars(tree, align_npd, "ACCTRAN")
plotAnc(tree, anc, cex.pie = .5, show.tip.label = FALSE)
title(paste0(tree_type, ": ACCTRAN"))
# Next: align known influenza strains with the ancestral sequences; known strains
# must have a date preceding their child strains; we assume the tree has at least
# 2 layers.
# Get sequences ready (combine ancestral with known strains).
seqa_dir <- paste0(result_dir, "/FASTA_anc.fa")
# For each reconstructed node, keep the highest-scoring base at every site.
sequ_anc <- as.DNAbin(lapply(anc, function(x) colnames(x)[apply(x, 1, which.max)]))
write.dna(sequ_anc, file = seqa_dir, format = "fasta")
sequ <- read.dna(seq_dir, format = "fasta")
sequ_anc <- read.dna(seqa_dir, format = "fasta", as.matrix = FALSE)
seq_all_dir <- paste0(result_dir, "/FASTA_all.fa")
sequ_all <- append(sequ_anc, sequ)
# Drop every name that occurs more than once (both copies): leaf sequences are
# not included in the distance calculation.
sequ_all <- sequ_all[!(duplicated(names(sequ_all)) |
                         duplicated(names(sequ_all), fromLast = TRUE))]
write.dna(sequ_all, file = seq_all_dir, format = "fasta")
# Align sequences; install MAFFT (https://mafft.cbrc.jp/alignment/software/).
align_anc_dir <- paste0(result_dir, "/alignment_anc.fa")
if (!file.exists(align_anc_dir)) {
  system(paste0("mafft ", seq_all_dir, " > ", align_anc_dir))
}
align_anc <- read.dna(align_anc_dir, format = "fasta")
# Next: calculate distances (ancestral x known sequences) from the alignment.
# Get distances ready (cached to disk on first computation).
dm_anc_dir <- paste0(result_dir, "/dm_anc.Rdata")
if (!file.exists(dm_anc_dir)) {
  dm_anc_ <- dist.ml(phyDat(align_anc), model = "JC69")
  dm_anc <- as.matrix(dm_anc_, sparse = TRUE)
  # Rows: ancestral (reconstructed) sequences; columns: known strains.
  dm_anc <- dm_anc[!rownames(dm_anc) %in% strains_id,
                   colnames(dm_anc) %in% strains_id]
  save(dm_anc, file = dm_anc_dir)
}
dm_anc <- get(load(dm_anc_dir))
strains_anc <- sapply(strsplit(colnames(dm_anc), "[-]"), function(x) x[1])
dates_anc <- as.Date(sapply(strsplit(colnames(dm_anc), "[-]"), function(x) x[2]))
# NOTE(review): the original line was garbled to "strains_anclabel tree nodes";
# restored to assign strain names onto the dates vector.
names(dates_anc) <- strains_anc
# Next: label tree nodes.
# Max number of months needed for a parent strain to evolve into a child strain
# (max = 444).
# NOTE(review): evo_month is not referenced below — confirm whether it was meant
# to replace the hard-coded 6-month window in the search loop.
evo_month <- 444
# Load metadata; rows are keyed by strain accession.
meta <- read.csv(meta_dir, stringsAsFactors = FALSE)[, -1]
rownames(meta) <- meta[, "strain"]
# Label nodes starting from the root using depth-first search.
edges <- tree$edge[order(tree$edge[, 2]), ] # first nseq child entries are leaves
# Assigning tip labels coerces the whole edge matrix to character.
edges[seq_len(nseq), 2] <- tree$tip.label
edges[, 2] <- sapply(strsplit(edges[, 2], "[-]"), function(x) x[1])
tree_labelled <- TRUE
nodes <- unique(as.vector(edges))
node <- unique(edges[!edges[, 1] %in% edges[, 2], 1]) # root
while (sum(!nodes %in% strains) > 0) {
  children <- sapply(strsplit(edges[edges[, 1] == node, 2], "[-]"), function(x) x[1])
  if (any(!children %in% strains)) {
    # Descend to the first still-unlabelled child.
    node <- children[!children %in% strains]
    node <- node[1]
  } else {
    # All children labelled: pick the closest known strain dated within the
    # 6 months before the earliest child date and not already used in the tree.
    min_date <- min(as.Date(as.character(meta[children, "date"])))
    min_date_ <- seq(min_date, length = 2, by = "-6 months")[2]
    dm_colind <- which(dates_anc < min_date & dates_anc > min_date_ &
                         !names(dates_anc) %in% edges)
    if (length(dm_colind) == 0) {
      print("incomplete! need data on older strains")
      tree_labelled <- FALSE
      break
    }
    # Relabel this node (in both edges and nodes) with the nearest strain.
    node_strain <- names(dm_colind[which.min(dm_anc[node, dm_colind])])
    nodes[nodes == node] <- node_strain
    edges[edges == node] <- node_strain
    # nodes_in = setdiff(nodes_in, node)
    # Move up to the parent of the node just labelled.
    node <- edges[edges[, 2] == node_strain, 1]
  }
}
# Next: prepare node list `virus` and edge list `vphy`.
# Viral strains (tree nodes) with subtype, date, and coordinates.
virus <- meta[nodes, c("strain", "subtype", "date", "lng", "lat")]
# Viral phylogeny (tree edges) with start/end coordinates.
vphy <- data.frame(id = seq_len(nrow(edges)),
                   start_lng = meta[edges[, 1], "lng"],
                   start_lat = meta[edges[, 1], "lat"],
                   end_lng = meta[edges[, 2], "lng"],
                   end_lat = meta[edges[, 2], "lat"])
# Next: plot the map.
Edges are coloured based on the child strain: red -> yellow (ancestor -> new strains).
# Heat colours: red (early) to yellow (recent); strip the "FF" alpha suffix.
colours <- sapply(heat.colors(length(unique(virus$date))),
                  function(x) gsub("FF$", "", x))
names(colours) <- sort(unique(virus$date))
# Phylogeny edges as great-circle lines.
geo_lines <- gcIntermediate(vphy[, c("start_lng", "start_lat")],
                            vphy[, c("end_lng", "end_lat")],
                            n = 200, addStartEnd = TRUE, sp = TRUE,
                            breakAtDateLine = TRUE)
# Map: markers and edges coloured by strain date.
leaflet() %>%
  addMiniMap(tiles = providers$Esri.OceanBasemap, width = 120, height = 80) %>%
  addProviderTiles(providers$Esri.OceanBasemap) %>%
  addCircleMarkers(data = virus, lng = ~lng, lat = ~lat,
                   color = as.vector(colours[virus[, "date"]]),
                   weight = 2, radius = 2,
                   label = paste0(virus[, "subtype"], " (",
                                  virus[, "strain"], ") ",
                                  virus[, "date"])) %>%
  addPolylines(data = geo_lines,
               color = as.vector(colours[meta[edges[, 2], "date"]]),
               weight = 1, opacity = 0.5, fill = FALSE, fillOpacity = 0.5,
               label = NULL)
# Keep in mind that our data set contains mostly strains found in the USA, so
# most strains on our phylogeny will be in the USA; this could be because the
# USA has the most flu strains, or more practically because the USA collects
# the most data!
Let's see where the root strain comes from.
# Display the root (and, optionally, the leaf) strains of the labelled tree.
meta[unique(edges[!edges[, 1] %in% edges[, 2], 1]), ] # root strain
# meta[unique(edges[!edges[, 2] %in% edges[, 1], 2]), ] # leaf strains